#load required R libraries
library(nhanesA)
library(forcats)
library(boot)
library(ggplot2)
library(ggpubr)
library(kableExtra)
library(janitor)
library(naniar)
library(patchwork)
library(gridExtra)
library(broom.mixed)
library(Epi)
library(vcd)
library(xfun)
library(epitools)
library(tidyverse)
theme_set(theme_light())
knitr::opts_chunk$set(comment=NA)Exploring Sociodemographic and Health Associations in the NHANES 2017–March 2020 Dataset: Insights from Multivariable and Comparative Analyses
1. Setup and Data Ingest
1.1. Loading R packages
2. Cleaning the Data
2.1. Data preparation
I downloaded the Blood Pressure data and Demographics data from 2017-March 2020 NHANES data
# pull in data from P_BPXO from NHANES and save it
bp_raw <- nhanes('P_BPXO') |> tibble()
demo_raw <- nhanes('P_DEMO') |> tibble()
saveRDS(demo_raw, file = "data/P_DEMO.rds")
saveRDS(bp_raw, file = "data/P_BPXO.rds")
# Now that data are saved, I can just read in the tibble
bp_raw <- readRDS(file = "data/P_BPXO.rds")
demo_raw <- readRDS("data/P_DEMO.rds")
DEMO <- demo_raw%>%
select(., SEQN,RIAGENDR,RIDAGEYR,DMDEDUC2,RIDRETH3,DMDMARTZ )
BPX <- bp_raw%>%
select(., SEQN, BPXOSY1,BPXOSY2)
Analysis_1Data <- left_join(DEMO, BPX, by = "SEQN")
BMX_raw <- nhanes("P_BMX")|> tibble() # Body measurements dataset
saveRDS(BMX_raw, file = "data/P_BMX.rds")
# Merge datasets on SEQN (participant ID)
saveRDS(Analysis_1Data,file = 'data/AnalysisA.rds')
BMX_raw<- readRDS("data/P_BMX.rds")
BMXBMI <- BMX_raw%>%select(SEQN,BMXBMI)
AnalysisB_data <- left_join(Analysis_1Data, BMXBMI, by = "SEQN")
saveRDS(AnalysisB_data, file = "data/AnalysisB_data.rds")
#Analysis C uses same dataset as above
#analysisD data download
SMQ_raw <- nhanes("P_SMQ")
AnalysisB_data <- readRDS('data/AnalysisB_data.rds')
smoke_data <- SMQ_raw%>% select(SEQN,SMQ020)
AnalysisD_data <- left_join(AnalysisB_data, smoke_data, by = "SEQN")
AnalysisData <- AnalysisD_data%>% filter( RIDAGEYR >=21 & RIDAGEYR<= 79)
saveRDS(AnalysisData, file = 'data/AnalysesData.rds')2. 1.1. Data cleaning and filtering for Analysis A
Here, I zeroed down to the Systolic Blood Pressure measurements for sessions 1 and 2 as the paired outcome variables for analysis A. This was co-selected with the SEQN columns just to ensure that systolic blood pressure is paired for each individual in the chosen population.
AnalysisData <- readRDS('data/AnalysesData.rds')
analysisA_data <- AnalysisData%>% select(SEQN,BPXOSY1,BPXOSY2)%>% drop_na()%>%
filter(BPXOSY1 >0, BPXOSY2 > 0)
saveRDS(analysisA_data, file = 'data/AnalysisA.rds')2.1.2. Analysis B Data Cleaning, re-coding and filtering
Here I used already downloaded demographics and downloaded examination datasets (body measurement data) and then combined the two data sets for this analysis.
AnalysesData <- readRDS('data/AnalysesData.rds')
# Select relevant variables and filter complete cases
analysisB_data <-AnalysesData %>%
select(RIAGENDR, BMXBMI)%>%
filter(complete.cases( RIAGENDR, BMXBMI))%>%
mutate(Gender = ifelse(RIAGENDR == "Male", 1,2))%>%
rename(BMI = BMXBMI)# Add readable gender labels
analysisB_data$Gender <- as.factor(analysisB_data$Gender)
analysisB_data <- analysisB_data[,-1]
#taking alot at the summary stats of the analysis B data
saveRDS(analysisB_data, "data/analysisB.rds")2.1.3. Analysis C Data Cleaning, re-coding, and filtering
AnalysesData <- readRDS('data/AnalysesData.rds')
analysisC_data <- AnalysesData%>%
select(SEQN,RIDRETH3,BMXBMI)%>%rename(Race = RIDRETH3, BMI = BMXBMI)
analysisC_data <- analysisC_data%>%
mutate(Race = factor(
Race,
levels = c("Mexican American","Other Hispanic","Non-Hispanic White", "Non-Hispanic Black","Non-Hispanic Asian"),
labels = c("Mexican","Hispanic", "White","Black", "Asian")))%>%filter(complete.cases(Race,BMI))%>% filter(BMI > 0)
length(unique(analysisC_data$BMI))[1] 443
saveRDS(analysisC_data, 'data/analysisC.rds')2.1.4. Analysis D Data cleaning, filtering and re-coding
# Filter and recode SMQ020 for 'Yes' and 'No' only
AnalysesData <- readRDS('data/AnalysesData.rds')
analysisD_data <- AnalysesData %>% rename(Smoking = SMQ020, BMI = BMXBMI)%>%
select(Smoking,BMI) %>% # Select relevant columns
filter(Smoking %in% c('Yes', 'No'), complete.cases(Smoking,BMI)) %>% filter(BMI > 0)%>%# Keep 'Yes' and 'No'.
mutate(
Smoking = ifelse(Smoking =="Yes","Smoker","Non_Smoker"), # Recode 'Yes' to 'Smoker' and 'No' to 'Non-Smoker'
Obesity = ifelse(BMI >= 30, "Obese", "Non_Obese") # Define obesity based on BMI
)%>%
mutate(Smoking = factor(Smoking, levels = c("Smoker","Non_Smoker")),
Obesity = factor(Obesity, levels = c("Obese","Non_Obese")))
saveRDS(analysisD_data, 'data/analysisD.rds')2.1.5 Analysis E Data cleaning, filtering and re-coding
AnalysesData <- readRDS('data/AnalysesData.rds')
analysisE_data <- AnalysesData %>%
select(SEQN,RIDRETH3,DMDEDUC2) %>%
filter( RIDRETH3 %in% c('Mexican American', 'Other Hispanic','Non-Hispanic White','Non-Hispanic Black','Non-Hispanic Asian'),DMDEDUC2 %in% c('Less than 9th grade','9-11th grade (Includes 12th grade with no diploma)','High school graduate/GED or equivalent','Some college or AA degree','College graduate or above'), complete.cases(RIDRETH3,DMDEDUC2))%>%
mutate(Race = factor(
RIDRETH3,
levels = c("Mexican American","Other Hispanic","Non-Hispanic White", "Non-Hispanic Black","Non-Hispanic Asian"),
labels = c("Mexican","Hispanic", "White","Black", "Asian")),
Education = factor(
DMDEDUC2,
levels = c("Less than 9th grade","9-11th grade (Includes 12th grade with no diploma)", "High school graduate/GED or equivalent", "Some college or AA degree","College graduate or above"),
labels = c( "less_9th_grade","9_12th_grade", "High_school","Some_College", "College_graduate" )
))%>% select(Race, Education)
saveRDS(analysisE_data, 'data/analysisE.rds')3. Codebook and Data Description
variable_description <- tibble::tribble(
~Variable_Name, ~Role_in_Analysis,~Type, ~Original_Code, ~Definition, ~Cutpoints_or_Units, ~Year_Measured,
"SEQN", "ID", "numeric", " Respondent sequence number", "Unique Identifier to identify any persons involved in the study" , "N/A", "2017-March 2020",
"Gender", "Analysis B Variable", "Categorical", "RIAGENDR","Gender of the participant","Persons (Male or Female)", "2017-March 2020",
"RIDAGEYR", "Used to select Adult persons aged 21-79 years","numeric","RIDAGEYR", "Participant's Age in years at the time of screening ", "21 -79 years","2017-March 2020", "Education", "Analysis E variable", "Categorical" , "DMDEDUC2", "This variable is the highest grade or level of education completed by adults 20 years and older.
The response categories are: less than 9th grade education, 9-11th grade education (includes 12th grade and no diploma), High school graduate/GED, some college or associates (AA) degree, and college graduate or higher.", "Education level - Adults 20+, from less than 9th-grade to College graduate", "2017-March 2020",
"Race/Ethnicity", "Analysis E variable", "Categorical", "RIDRETH3 ", "RIDRETH3: This is the race-ethnicity variable included in the demographics file since the 2011-2012 survey cycle to accommodate the oversample of Asian Americans. It was derived from responses to the survey questions on race and Hispanic origin.
Respondents who self-identified as “Mexican American” were coded as such (i.e., RIDRETH3=1) regardless of their other race-ethnicity identities. Otherwise, self-identified “Hispanic” ethnicity would result in code “2, Other Hispanic” in the RIDRETH3 variable.
All other non-Hispanic participants would then be categorized based on their self-reported races: non-Hispanic white (RIDRETH3=3), non-Hispanic black (RIDRETH3=4), non-Hispanic Asian (RIDRETH3=6)", "Number of participants who identified as Race/Hispanic origin w/ NH Asian", "2017-March 2020",
"BPXOSY1 ", "Analysis A-variable","numeric", "BPXOSY1", "Systolic - 1st oscillometric reading", "(mm Hg)", "2017-March 2020",
"BPXOSY2","Analysis A-variable", "numeric", "BPXOSY2", "Systolic - 2nd oscillometric reading","(mm Hg)", "2017-March 2020",
"BMI","Analysis variable", "numeric", "BMXBMI", "Body Mass Index (BMI) was calculated as weight in kilograms divided by height in meters squared, and then rounded to one decimal place.", "(kg/m**2)", "2017-March 2020",
"Smoking", "Analysis D-variable", "Categorical(yes, no)", "SMQ020", "Smoked at least 100 cigarettes in life, (yes for smoked, no never smoked","Number of participants interviewed", "2017-March 2020")
variable_description |> kbl() |> kable_styling()| Variable_Name | Role_in_Analysis | Type | Original_Code | Definition | Cutpoints_or_Units | Year_Measured |
|---|---|---|---|---|---|---|
| SEQN | ID | numeric | Respondent sequence number | Unique Identifier to identify any persons involved in the study | N/A | 2017-March 2020 |
| Gender | Analysis B Variable | Categorical | RIAGENDR | Gender of the participant | Persons (Male or Female) | 2017-March 2020 |
| RIDAGEYR | Used to select Adult persons aged 21-79 years | numeric | RIDAGEYR | Participant's Age in years at the time of screening | 21 -79 years | 2017-March 2020 |
| Education | Analysis E variable | Categorical | DMDEDUC2 | This variable is the highest grade or level of education completed by adults 20 years and older. The response categories are: less than 9th grade education, 9-11th grade education (includes 12th grade and no diploma), High school graduate/GED, some college or associates (AA) degree, and college graduate or higher. | Education level - Adults 20+, from less than 9th-grade to College graduate | 2017-March 2020 |
| Race/Ethnicity | Analysis E variable | Categorical | RIDRETH3 | RIDRETH3: This is the race-ethnicity variable included in the demographics file since the 2011-2012 survey cycle to accommodate the oversample of Asian Americans. It was derived from responses to the survey questions on race and Hispanic origin. Respondents who self-identified as “Mexican American” were coded as such (i.e., RIDRETH3=1) regardless of their other race-ethnicity identities. Otherwise, self-identified “Hispanic” ethnicity would result in code “2, Other Hispanic” in the RIDRETH3 variable. All other non-Hispanic participants would then be categorized based on their self-reported races: non-Hispanic white (RIDRETH3=3), non-Hispanic black (RIDRETH3=4), non-Hispanic Asian (RIDRETH3=6) | Number of participants who identified as Race/Hispanic origin w/ NH Asian | 2017-March 2020 |
| BPXOSY1 | Analysis A-variable | numeric | BPXOSY1 | Systolic - 1st oscillometric reading | (mm Hg) | 2017-March 2020 |
| BPXOSY2 | Analysis A-variable | numeric | BPXOSY2 | Systolic - 2nd oscillometric reading | (mm Hg) | 2017-March 2020 |
| BMI | Analysis variable | numeric | BMXBMI | Body Mass Index (BMI) was calculated as weight in kilograms divided by height in meters squared, and then rounded to one decimal place. | (kg/m**2) | 2017-March 2020 |
| Smoking | Analysis D-variable | Categorical(yes, no) | SMQ020 | Smoked at least 100 cigarettes in life, (yes for smoked, no never smoked | Number of participants interviewed | 2017-March 2020 |
4. Analysis A
4.1. Research Question
4.1.1.Problem statement
Research has shown that systolic blood pressure (SBP) is a critical measure for diagnosing hypertension and assessing cardiovascular risk. Repeated measurements are usually taken to improve accuracy and account for variability caused by circumstances such as white coat hypertension (temporary rise in blood pressure due to clinical environment), measurement error or technical variability and physiological changes during the measurement process. And therefore understanding the consistency or difference between successive SBP measurements can inform the protocol used for measuring blood pressure by highlighting any systematic bias in measurements and also help in identifying persons who at a risk of developing hypertension.
4.1.2. Research Question:
Is there a significant difference between the first and second oscillometric systolic blood pressure measurements (BPXOSY1 and BPXOSY2) in individuals as recorded in the NHANES 2017–March 2020 dataset?.
4.1.3. Hypotheses
4.1.3.1 Null Hypothesis \(({H_o})\)
There is no significant difference between the mean of the first systolic blood pressure measurement (BPXOSY1) and the second systolic blood pressure measurement (BPXOSY2)
\[{H_o} : {μ_{BPXOSY1} = μ_{BPXOSY2}}\]
4.1.3.2. Alternative Hypothesis
There is a significant difference between the mean of the first systolic blood pressure measurement (BPXOSY1) and the second systolic blood pressure measurement (BPXOSY2).
\[{H_A} : {μ_{BPXOSY1} \neq μ_{BPXOSY2}}\]
4.2. Describing The Data
Refer to the codebook in section 3 for the description of the variables (BPXOSY1 and BPXOSY2) used in this analysis. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries (mean, median, sd, interquartile range, minimum, and maximum) and continued to provide some visualization graphics including histogram, boxplots, and Q-Q plots to provide a better understanding of the data.
analysisA_data <- readRDS('data/AnalysisA.rds')# read the
summary(analysisA_data %>% select(BPXOSY1,BPXOSY2)) BPXOSY1 BPXOSY2
Min. : 66.0 Min. : 69.0
1st Qu.:111.0 1st Qu.:111.0
Median :121.0 Median :121.0
Mean :124.3 Mean :124.1
3rd Qu.:135.0 3rd Qu.:135.0
Max. :225.0 Max. :222.0
4.2.1. Visualization
In order to have a clear understanding of the data variables in terms data distribution and checking for normality, I used the Histogram, Boxplot, and the Q-Q plot to display the data.
4.2.1.1. Histogram and Boxplot
From the visualizations above, the data
par(mfrow =c(2,2))
ggplot(analysisA_data, aes(x = log(BPXOSY1))) +
geom_histogram(fill = "slateblue", col = "white",
bins = 20) +
labs(title = "LogTransformed-Systolic BP First Reading in `NHANES`",
x = "log(Systolic BP First Reading (mm Hg))")+
theme_classic(base_size = 18) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggplot(analysisA_data, aes(x = log(BPXOSY2))) +
geom_histogram(fill = "orange", col = "white",
bins = 20) +
labs(title = "Systolic BP Second Reading in `NHANES`",
x = "log(Systolic BP Second Reading(mm Hg))")+
theme_classic(base_size = 18) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggplot(analysisA_data, aes(x = "", y = log(BPXOSY1))) +
geom_boxplot(fill = "slateblue", outlier.size = 2,
outlier.color = "slateblue") +
#coord_flip() +
labs(y = "log(Systolic BP First Reading (mm Hg))")+
theme_classic(base_size = 14) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggplot(analysisA_data, aes(x = "", y = log(BPXOSY2)))+
geom_boxplot(fill = "orange", outlier.size = 2,
outlier.color = "orange") +
#coord_flip() +
labs(y = "log(Systolic BP Second Reading(mm Hg))")+
theme_classic(base_size = 14) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggsave("LogTransHistogram_and_Boxplots.png")4.2.1.2. Q-Q Plot
par(mfrow = c(1, 2))
ggplot(analysisA_data, aes(sample = log(BPXOSY1))) +
geom_qq(col = "slateblue") +
geom_qq_line(col = "magenta") +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q: NHANES log(BPXOSY1)")+
theme_classic(base_size = 18) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggplot(analysisA_data, aes(sample = log(BPXOSY2))) +
geom_qq(col = "orange") +
geom_qq_line(col = "black") +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q: NHANES log(BPXOSY2)")+ theme_classic(base_size = 18) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggsave("Q-QPlots_for_systolic_blood_pressure.png")From the Q-Q plot we see that our data is still right skewed even after log-transformation and therefore does not satisfy the normality requirements and hence I will apply non-parametric methods to compare the paired means
4.2.1.3 Scatter Plot Paired Sample Box plots
# Scatter plot of paired data
ggplot(analysisA_data, aes(x = log(BPXOSY1), y = log(BPXOSY2))) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Systolic Blood Pressure (First Reading vs Second Reading)",
x = "BPXOSY1 (First Reading)", y = "BPXOSY2 (Second Reading)") +
theme_classic(base_size = 18) +
theme(
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))# Paired boxplot
analysisA_data %>% mutate(log_sbp1 = log(BPXOSY1), log_sbp2 = log(BPXOSY2))%>%
pivot_longer(cols = c(log_sbp1, log_sbp2), names_to = "ReadingSession", values_to = "SystolicBloodPressure") %>%
ggplot(aes(x = ReadingSession, y = SystolicBloodPressure, fill = ReadingSession)) +
geom_boxplot() +
labs(title = "Distribution of Blood Pressure (Paired Sessions)",
x = "Reading-Session", y = "Blood Pressure (mmHg)") +
theme_classic(base_size = 18) +
theme(legend.position = "left",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggsave("scatter plot_bp_sessions.png")
ggsave("Boxplot_Blood_Pressure_Sessions.png")
write.csv(analysisA_data, "data/Paired_Systolic_BP_data.csv")4.3. Main Analysis
Here, I used Wilcoxon signed-rank sum test to compare the means of the systolic blood pressure between the two sessions since the data did not meet the normality requirements. Additionally, I used a bootstrap procedure to calculate a confidence interval for the mean difference.
4.3.1. Wilcoxon Signed-Rank Test
# Perform Wilcoxon signed-rank test
wilcox_result <- wilcox.test(log(analysisA_data$BPXOSY1), log(analysisA_data$BPXOSY2), paired = TRUE)
print(wilcox_result)
Wilcoxon signed rank test with continuity correction
data: log(analysisA_data$BPXOSY1) and log(analysisA_data$BPXOSY2)
V = 10992502, p-value = 0.009045
alternative hypothesis: true location shift is not equal to 0
4.3.2. Bootstrap Method
# Define bootstrap function
bootstrap_diff <- function(data, indices) {
sample_data <- data[indices, ]
mean(log(sample_data$BPXOSY1) - log(sample_data$BPXOSY2))
}
# Run bootstrap
set.seed(123)
boot_result <- boot(data = analysisA_data, statistic = bootstrap_diff, R = 1000)
# Confidence interval
boot_ci <- boot.ci(boot_result, type = "perc")
print(boot_ci)BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates
CALL :
boot.ci(boot.out = boot_result, type = "perc")
Intervals :
Level Percentile
95% ( 0.0004, 0.0026 )
Calculations and Intervals on Original Scale
4.4. Conclusions
From the results of the Wilcoxon signed rank, We observe that The p-value is very small (0.009), which is far below the common significance level (α=0.05), which implies that the true median difference between Systolic blood pressure values for the 2 oscillometric readings is not equal to zero. And hence there is a statistically significant difference in the paired samples (BPXOSY1 and BPXOSY2). The bootstrap analysis further confirms that there is a significant difference between the paired samples. The mean difference is estimated to be between 0.0004 and 0.0026 with 95% confidence, which is above zero. This supports the conclusion that the two paired systolic blood pressure are statistically different.
5. Analysis B
5.1. Question
5.1.1. Problem Statement
Body Mass Index (BMI) is a widely used indicator for assessing body fat based on height and weight. Gender differences in BMI may arise due to biological, hormonal, and societal factors influencing body composition, metabolism, lifestyle behaviors among others. Understanding how BMI varies with gender within a population may provide us with insights on how this association may influence the gender based health outcomes within the United States population.
5.1.2. Research Question: Is there any association between Body Mass Index and Gender?
5.1.3. Hypotheses
5.1.3.1. Null Hypothesis \(({H_o})\)
There is no association between Body Mass Index (BMI) and Gender in the United States population. The mean BMI of males is equal to the mean BMI for females; \[{H_o} : {μ_{MaleBMI} = μ_{FemaleBMI}}\]
5.1.3.2. Alternative Hypothesis \(({H_A})\):
There is an association between BMI and Gender in the United States population. The mean BMI for males is not equal to the mean BMI for female; \[{H_A} : {μ_{MaleBMI} \neq μ_{FemaleBMI}}\]
5.2. Describing The Data
Refer to the codebook in section 3 for the description of the variables (BMI) and Gender) used in this analysis. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries (mean, median, sd, interquartile range, minimum, and maximum) and continued to provide some visualization graphics including histogram, boxplots, and Q-Q plots to provide a better understanding of the data.
5.2.1. Data summaries for the chosen variables for Analysis B
analysisB_data <- readRDS("data/analysisB.rds")
analysisB_data %>%
group_by(Gender) %>%
summarise(
Mean_BMI = mean(BMI),
SD_BMI = sd(BMI),
IQR_BMI = IQR(BMI),
n = n()
)%>% kable(digits = 3)%>% kable_styling()| Gender | Mean_BMI | SD_BMI | IQR_BMI | n |
|---|---|---|---|---|
| 1 | 29.571 | 6.631 | 7.8 | 3732 |
| 2 | 30.858 | 8.493 | 10.3 | 4004 |
5.2.2. Visualization of the data distribution
5.6.1 Boxplot, Histogram, Density plot, and Q-Q Plot
Here, I use a histogram, Q-Q plot, Boxplot, and the density plot to visualize the data distribution and assessment for normality.
par(mfrow = c(2, 2))
ggplot(analysisB_data, aes(x = BMI)) +
geom_histogram(fill = "slateblue", col = "white",
bins = 20) +
labs(title = "Body Mass Index Distribution in `NHANES`",
x = "Body Mass Index")+
theme_classic(base_size = 18) +
theme(legend.position = "left",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))ggplot(analysisB_data, aes(sample = BMI)) +
geom_qq(col = "slateblue") +
geom_qq_line(col = "magenta") +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q: NHANES BMI")+
theme_classic(base_size = 18) +
theme(legend.position = "none",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))#Boxplot
ggplot(analysisB_data, aes(x = Gender, y = BMI, fill = Gender)) +
geom_boxplot() + coord_flip()+
labs(title = "Comparison of BMI by Gender",
x = "Gender",
y = "Body Mass Index (BMI)") +
theme_classic(base_size = 18) +
theme(legend.position = "left",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))#density plot
ggplot(analysisB_data, aes(x = BMI, fill = Gender)) +
geom_density(alpha = 0.5) +
labs(title = "BMI Distribution by Gender",
x = "Body Mass Index (BMI)",
y = "Density") +
theme_classic(base_size = 18) +
theme(legend.position = "left",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18))5.3. Main Analysis
From the histogram and the Q-Q Normal plot, we observe that Body Mass index is right skewed and hence does not satisfy the normality requirement and hence in the subsequent analyses, I will use non parametric methods such as Wilcoxon rank-sum test and Bootstrap for calculating the confidence intervals.
5.3.1. Wilcoxon Rank-Sum Test
set.seed(112724)
#perform Wilcoxon Rank-sum test
wilcox_test = wilcox.test(BMI ~ Gender, data = analysisB_data)
glance(wilcox_test)%>% kable(digits = 3)%>% kable_styling()| statistic | p.value | method | alternative |
|---|---|---|---|
| 6984618 | 0 | Wilcoxon rank sum test with continuity correction | two.sided |
5.3.2. Using Bootstrap Confidence Interval
I created function to use to compute the mean difference using the bootstrap method
#Define a bootstrap function
bootstrap_mean_diff <- function(data, indices){
sample_data <- data[indices,]
mean(sample_data$BMI[sample_data$Gender=="1"])- mean(sample_data$BMI[sample_data$Gender=="2"])
}Running the bootstrap to compute the 90% confidence intervals for the independent sample mean difference.
# the bootstrap
set.seed(2024)
boot_result <- boot(data = analysisB_data, statistic = bootstrap_mean_diff, R =1000)
boot_ci = boot.ci(boot_result, type = "perc", conf = 0.90)
print(boot_ci)BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates
CALL :
boot.ci(boot.out = boot_result, conf = 0.9, type = "perc")
Intervals :
Level Percentile
90% (-1.555, -1.014 )
Calculations and Intervals on Original Scale
5.3.3. Visualize the Results
I used the Boxplot/Violin plot to display the mean difference for better comparison
ggplot(analysisB_data, aes(x = Gender, y = BMI, fill = Gender)) +
geom_violin(trim = FALSE, alpha = 0.4, color = "coral", lwd = 0.5) +
geom_boxplot(width = 0.4, outlier.shape = NA, alpha = 0.7, color = "gray20") +
stat_summary(fun = mean, geom = "point", shape = 23, size = 4, color = "black", fill = "yellow") +
stat_summary(fun = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = 0.3, color = "black")+
coord_flip() +
labs(title = "Comparison of BMI by Gender",
caption = "Data Source: NHANES 2017-March 2020",
x = "Gender",
y = "Body Mass Index (BMI)") +
theme_classic(base_size = 18) +
theme(legend.position = "none",
axis.text = element_text(face = "bold",size = 18),
axis.title.y = element_text(face = "bold", size = 18),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5,size = 18)) +
scale_fill_brewer(palette = "Paired")5.4. Conclusions
The mean BMI for males was 29.571(sd = 6.631), and the mean BMI for females was 30.858 (sd = 8.493). The Wilcoxon rank-sum test indicated that there is a significant difference between mean BM1 between males and females (p <0.05). Bootstrap resampling provided a confidence interval of [-1.555, -1.014], since the confidence bounds lie below 0 and are both negative indicate that the male group has a significantly lower mean than the female group.
6. Analysis D
6.1. Question
6.1.1. Problem Statement
Smoking and obesity are two significant risk factors for numerous chronic diseases, including cardiovascular diseases, diabetes, and certain cancers. Investigating the relationship between these two factors is essential to understand whether smoking status (current smoker vs. non-smoker) is associated with obesity status (obese vs. non-obese). By dividing the population into four distinct groups based on these binary categorical variables, we aim to assess the interaction between smoking and obesity and evaluate any potential public health implications.
6.1.2. Research Question
Is there a significant association between smoking status and obesity status in the US population, as represented by the NHANES 2017-March 2020 data?
6.1.3. Hypotheses
6.1.3.1 Null Hypothesis \(({H_o})\)
There is no association between smoking status (current smoker vs. non-smoker) and obesity status (obese vs. non-obese) in the population. The observed frequencies in the 2x2 table are consistent with the assumption of independence.
6.1.3.2. Alternative hypothesis \(({H_A})\)
There is a significant association between smoking status (current smoker vs. non-smoker) and obesity status (obese vs. non-obese) in the population. The observed frequencies in the 2x2 table deviate significantly from what is expected under the assumption of independence.
6.2. Describing The Data
Refer to the codebook in section 3 for the description of the variable SMQ020 and Body Mass Index (BMXBMI). However, for this analysis BMI was converted into a binary variable with people having BMI > 30 considered to be obese and those with BMI < 30 considered non-obese and as earlier descibed smoking was binarized into non-smoker and smoker. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries, 2x2 table and continued to provide some visualization graphics including bar plot, mosaic plot, and boxplots to provide a better understanding of the data.
6.2.1. Data Summaries
analysisD_data |>
tabyl(Smoking,Obesity) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns(position = "front")%>% kable()%>%kable_styling()| Smoking | Obese | Non_Obese |
|---|---|---|
| Smoker | 1,442 (44.8%) | 1,775 (55.2%) |
| Non_Smoker | 1,948 (43.1%) | 2,567 (56.9%) |
| Total | 3,390 (43.8%) | 4,342 (56.2%) |
6.2.2. Data Visualization
6.2.2.1. Bar plot
analysisD_data <- readRDS("data/analysisD.rds")
ggplot(analysisD_data, aes(x = Smoking, fill = Obesity))+
geom_bar(position = "dodge")+
labs(
title = "Smoking Status vs. Obesity",
x = "Smoking Status",
y = "Count",
fill = "Obesity"
)+
theme_classic(base_size = 16) +
theme(legend.position = "right",
axis.title.y = element_text(face = "bold", size = 18),
axis.text = element_text(face ="bold", size = 16),
axis.title.x = element_text(face = "bold", size = 18),
plot.title = element_text(face = "bold", hjust = 0.5, size =))The analysis examined the association between smoking status (smoker vs. non-smoker) and obesity (obese vs. non-obese) using NHANES 2017-March 2020 data. The contingency table revealed a total of3217 smokers and 4515 non-smokers, 3390 obese people and 4342 non-obese people.
6.2.2.2. Mosaicplot from Contingency Table
contigency_table <- table(analysisD_data$Smoking,analysisD_data$Obesity)
mosaicplot(
contigency_table,
main ="Mosaic Plot of Smoking and Obesity", color = TRUE
)Now, it’s helpful to show our 2 x 2 table. We see there were 1,442 (44.8%) of obese smokers among 3217 smokers and 1948 ( 43.1%) obese people among 4515 non-smokers.
6.3. Main Analysis
6.3.1. Chi-square test
#contigency_table <- analysisD_data %>% tabyl(Smoking, Obesity)
chiSq_test <-chisq.test(contigency_table)
glance(chiSq_test) %>% kable(digits = 3) %>% kable_styling()| statistic | p.value | parameter | method |
|---|---|---|---|
| 2.084 | 0.149 | 1 | Pearson's Chi-squared test with Yates' continuity correction |
6.3.2. Compute relative risk
Finally, we can obtain our estimate of relative risk, by pushing the relevant data into the Epi package’s twoby2 function, using a 90% confidence interval, as requested.
twoby2(analysisD_data$Smoking, analysisD_data$Obesity, conf.level = 0.90)2 by 2 table analysis:
------------------------------------------------------
Outcome : Obese
Comparing : Smoker vs. Non_Smoker
Obese Non_Obese P(Obese) 90% conf. interval
Smoker 1442 1775 0.4482 0.4339 0.4627
Non_Smoker 1948 2567 0.4315 0.4194 0.4436
90% conf. interval
Relative Risk: 1.0389 0.9955 1.0843
Sample Odds Ratio: 1.0705 0.9918 1.1556
Conditional MLE Odds Ratio: 1.0705 0.9907 1.1568
Probability difference: 0.0168 -0.0020 0.0356
Exact P-value: 0.1431
Asymptotic P-value: 0.1424
------------------------------------------------------
The risk of being Obese for smokers relative to non-smokers is 1.0389 (90% CI:0.9955,1.0843), which implies that smokers are 3.89% more likely to be obese compared to non-smokers and since the 90% confidence intervals includes 1, this result is not statistically significant.
The Odds of being obese for smokers relative to non-smokers is 1.0705 (90% CI: 0.9918,1.1556). This implies that smokers have 7.05% higher odds of being obese compared to non-smokers. The confidence interval includes 1, indicating no statistical significance. The conditional maximum likelihood estimate odds ratio (MLE Odds Ratio) is 1.0705 (90%CI:0.9907,1.1568). This is consistent with the sample Odds Ratio, confirming that there is no statistically significant association.
Additionally, the absolute difference in probability of being obese between smokers and non-smokers is 0.0168(90%CI: -0.002,0.0356), which highlights that smokers have a 1.68% higher probability of being obese compared to non-smokers. Further highlighting that there is no statistically significant association between smoking status and obesity as is also t indicated by exact p-value (p =0.1431) and asymptotic p-value (p=0.1424) being greater than 0.05 and the chi-square test (p=0.149).
6.4. Conclusions.
From the statistical analysis test conducted under this test analysis, we did not observe a statistically significant association between smoking status and obesity. However, this association may be influenced by other confounding factors such as lifestyle, physical activity, diet and other socio-economic factors.
7. Analysis E
7.1. Question
7.1.1. Research Question
Is there a statistically significant relationship between race/ethnicity and education level in the NHANES population using a statistical analysis of a contingency table?
7.1.2. Problem Statement
Understanding the relationship between race/ethnicity and education level is critical for addressing systemic disparities in access to educational opportunities. Education level is a key determinant of socioeconomic status, health outcomes, and overall quality of life. Similarly, race and ethnicity often intersect with structural barriers that shape access to education. Examining this relationship can provide insights into equity gaps and inform policy interventions.
7.1.3. Hypothesis
7.1.3.1. Null Hypothesis \(({H_0})\):
There is no association between race/ethnicity and education level in the population represented by the 2017-March 2020 NHANES population data. The observed frequencies in the J x K table are consistent with what would be expected if the variables were independent.
7.1.3.2. Alternative Hypothesis \(({H_A})\):
There is an association between race/ethnicity and education level in the population. The observed frequencies in the J x K table deviate significantly from what would be expected under the assumptions of independence.
7.2. Describing The Data
The data used for this analysis has been described in the codebook in section 3. However, this analysis unravels the association between two categorical variables of race/ethnicity and education level in the NHANES 2017-March 2020 population data set. Race/ethnicity is a categorical variable with 5 levels including Mexican, Hispanic, White, Black and American re-coded from the 6 levels originally present in the RIDRETH3 variable whereas Education level is also a categorical variable with 5 levels too, which have been re-coded to less_9th grade, 9_12th_grade, High_school, Some_college, and College_graduate, describing the 5 of the 6 levels of the original ‘DMDEDUC2’ variable in the 2017-March 2020 NHANES dataset.
7.2.1. Data summary
analysisE_data <-readRDS('data/analysisE.rds')
summary(analysisE_data) Race Education
Mexican :1008 less_9th_grade : 628
Hispanic: 889 9_12th_grade : 881
White :2693 High_school :1905
Black :2337 Some_College :2566
Asian :1074 College_graduate:2021
From the summary statistics above we observed that there were more non-Hispanic white people than any other race in the dataset and the least represented group were other Hispanic. Notably, there were more people with a Some college degree followed by college graduates and there fewer people having a below 9th grade education.
analysisE_data |>
tabyl(Race,Education) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns(position = "front")%>% kable()%>%kable_styling()| Race | less_9th_grade | 9_12th_grade | High_school | Some_College | College_graduate |
|---|---|---|---|---|---|
| Mexican | 266 (26.4%) | 170 (16.9%) | 230 (22.8%) | 250 (24.8%) | 92 (9.1%) |
| Hispanic | 182 (20.5%) | 133 (15.0%) | 182 (20.5%) | 245 (27.6%) | 147 (16.5%) |
| White | 51 (1.9%) | 219 (8.1%) | 679 (25.2%) | 1,007 (37.4%) | 737 (27.4%) |
| Black | 52 (2.2%) | 289 (12.4%) | 672 (28.8%) | 875 (37.4%) | 449 (19.2%) |
| Asian | 77 (7.2%) | 70 (6.5%) | 142 (13.2%) | 189 (17.6%) | 596 (55.5%) |
| Total | 628 (7.8%) | 881 (11.0%) | 1,905 (23.8%) | 2,566 (32.1%) | 2,021 (25.3%) |
It was observed that Mexican American had the least education with at least 26.4% having 9th grade or below whereas Non-Hispanic Asians were the most educated with at least 55.5% of all Non-Hispanic Asian being a college graduate. This can be clearly visualized on the bar plot and the mosaic plots below.
7.2.2. Visualization
7.2.2.1 Bar plot
I used the Bar graph to show the distribution of of education level frequency/count across the different races.
ggplot(analysisE_data, aes(x = Race, fill = Education))+
geom_bar(position = "dodge")+
labs(
title = "Education level by Race/Ethnicity",
x = "Race",
y = "Count",
fill = "Education level"
)+
theme_classic(base_size = 16) +
theme(legend.position = "left",
axis.title.y = element_text(face = "bold", size = 16),
axis.text = element_text(face ="bold", size = 16),
axis.title.x = element_text(face = "bold", size = 16),
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
plot.subtitle = element_text(hjust = 0.5,size = 16))7.2.2.2 Mosaic plot
# Create a mosaic plot with `vcd::mosaic`
mosaic(
~ Race + Education,
data = as.data.frame(analysisE_data),
shade = TRUE,
labeling_args = list(
gp_labels = gpar(fontsize = 20), # Increase cell label size
gp_varnames = gpar(fontsize = 20), # Increase axis labels size
gp_main = gpar(fontsize = 20) # Increase title size
),
main = "Race vs Education Level"
)7.3. Main Analysis
Basically, I applied the Chi-Square Test of independence to assess whether there is a significant association between race/ethnicity and education level in the population using the data as described
7.3.1. Chi-Square T test of Independence
analysisE_contigency_table <- analysisE_data %>% tabyl(Race, Education)
cell_counts <- as.data.frame(analysisE_contigency_table)
analysisE_chisq <- chisq.test(analysisE_contigency_table)
analysisE_chisq %>% glance() %>% kable()%>%kable_styling()| statistic | p.value | parameter | method |
|---|---|---|---|
| 1702.674 | 0 | 16 | Pearson's Chi-squared test |
residuals <- analysisE_chisq$stdres
print(residuals) Race less_9th_grade 9_12th_grade High_school Some_College College_graduate
Mexican 23.4109832 6.350916 -0.7910144 -5.289147 -12.608953
Hispanic 14.8438507 3.990123 -2.4778125 -3.057076 -6.349636
White -14.1079856 -5.859656 2.1002162 7.265012 3.090912
Black -12.0149837 2.487403 6.6712304 6.610652 -7.995896
Asian -0.8899613 -5.055874 -8.7556165 -10.921523 24.508110
7.3.2. Convert residuals to a Heatmap
residuals_df <- as.data.frame(residuals)
colnames(residuals_df) <- c("Race", "Education", "Residual")
ggplot(residuals_df, aes(x = Education, y = Race, fill = Residual)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
labs(title = "Standardized Residuals Heatmap", x = "Education Level", y = "Race") +
theme_minimal()From the Chi-square residual heatmap, we observe that the Non-Hispanic population is overrepresented in attaining higher education as indicated by large positive residual (red), also we see that the Non-Hispanic Black population is overrepresented at lower education category while being underrepresented in the higher education category. Non-Hispanic Asians are under shown to be less represented at lower education levels suggesting they are less likely to fall into the lower education category whereas Mexican American and Other Hispanic groups vary, but with significant deviations in some education levels highlighting over-representation or underrepresentation.
7.4. Conclusions
These findings suggest that there is a significant relationship between race/ethnicity and education levels, which supports for further exploration of the systemic or cultural factors driving these disparities.
8. Session Information
sessionInfo()R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] lubridate_1.9.3 stringr_1.5.1 dplyr_1.1.4
[4] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[7] tibble_3.2.1 tidyverse_2.0.0 epitools_0.5-10.1
[10] xfun_0.47 vcd_1.4-12 Epi_2.53
[13] broom.mixed_0.2.9.6 gridExtra_2.3 patchwork_1.2.0
[16] naniar_1.1.0 janitor_2.2.0 kableExtra_1.4.0
[19] ggpubr_0.6.0 ggplot2_3.5.1 boot_1.3-30
[22] forcats_1.0.0 nhanesA_1.1
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 viridisLite_0.4.2 farver_2.1.2
[4] fastmap_1.2.0 digest_0.6.37 timechange_0.3.0
[7] lifecycle_1.0.4 survival_3.6-4 magrittr_2.0.3
[10] compiler_4.4.1 rlang_1.1.4 tools_4.4.1
[13] utf8_1.2.4 yaml_2.3.10 data.table_1.16.0
[16] knitr_1.48 ggsignif_0.6.4 labeling_0.4.3
[19] htmlwidgets_1.6.4 curl_6.0.1 RColorBrewer_1.1-3
[22] plyr_1.8.9 xml2_1.3.6 abind_1.4-5
[25] withr_3.0.1 foreign_0.8-86 numDeriv_2016.8-1.1
[28] fansi_1.0.6 colorspace_2.1-1 future_1.34.0
[31] globals_0.16.3 scales_1.3.0 MASS_7.3-60.2
[34] cli_3.6.3 etm_1.1.1 rmarkdown_2.28
[37] ragg_1.3.2 generics_0.1.3 rstudioapi_0.16.0
[40] tzdb_0.4.0 httr_1.4.7 splines_4.4.1
[43] rvest_1.0.4 parallel_4.4.1 selectr_0.4-2
[46] vctrs_0.6.5 Matrix_1.7-0 jsonlite_1.8.8
[49] carData_3.0-5 car_3.1-2 hms_1.1.3
[52] rstatix_0.7.2 visdat_0.6.0 listenv_0.9.1
[55] systemfonts_1.1.0 cmprsk_2.2-12 glue_1.7.0
[58] parallelly_1.38.0 codetools_0.2-20 stringi_1.8.4
[61] gtable_0.3.5 lmtest_0.9-40 munsell_0.5.1
[64] furrr_0.3.1 pillar_1.9.0 htmltools_0.5.8.1
[67] R6_2.5.1 textshaping_0.4.0 evaluate_0.24.0
[70] lattice_0.22-6 highr_0.11 backports_1.5.0
[73] broom_1.0.6 snakecase_0.11.1 Rcpp_1.0.13
[76] svglite_2.1.3 nlme_3.1-164 mgcv_1.9-1
[79] zoo_1.8-12 pkgconfig_2.0.3